Part 1: descriptive analysis of delays using interactive US maps

(Author: Zifan Wang)

Step 0: setting up credentials for plot sharing / loading the dataset

Sys.setenv("plotly_username"="ziwang970")
Sys.setenv("plotly_api_key"="Rh542AcijT2qJ07JZsQY")
# read in dataset
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df <- read.csv("C:/Users/ziwan/Desktop/2018 Fall Courses/BST 260/Project/dataset/flight2017.csv")

Step 1: descriptive US maps for delays by states

# calculate mean departure delay minutes by state
state_delay <- df %>%
  group_by(ORIGIN_STATE_ABR) %>%
  summarize(mean_delay = mean(DEP_DELAY_NEW, na.rm = TRUE))
# give state boundaries white borders
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# make the plot
p <- plot_geo(state_delay, locationmode = 'USA-states') %>%
  add_trace(
    z = ~mean_delay, locations = ~ORIGIN_STATE_ABR,
    color = ~mean_delay, colors = 'Purples'
  ) %>%
  colorbar(title = "Departure delay in minutes") %>%
  layout(
    title = '2017 average departure delay (minutes) by states',
    geo = g
  )

p

Step 2: descriptive US maps for delays by cities

# calculate mean departure delay minutes by city
city_delay <- df %>%
  group_by(ORIGIN_CITY_NAME) %>%
  summarize(mean_delay = mean(DEP_DELAY_NEW, na.rm = TRUE))

library(splitstackshape)
city_delay <- cSplit(city_delay, "ORIGIN_CITY_NAME", sep=",")

city_delay <- city_delay %>% mutate(name = ORIGIN_CITY_NAME_1)
# add the coordination of cities
coordinate <- read.csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_us_cities.csv')


city_delay <- city_delay %>% mutate(name = trimws(as.character(name)))

coordinate <- coordinate %>% mutate(name = trimws(as.character(name)))

merged_city_delay <- left_join(city_delay,coordinate, by='name')

merged_city_delay <- merged_city_delay %>% 
  group_by(name) %>%
  summarize(mean_delay = mean(mean_delay, na.rm = TRUE), lat = mean(lat), lon = mean(lon))
# draw the plot by cities
merged_city_delay$q <- with(merged_city_delay, cut(mean_delay, quantile(mean_delay)))
levels(merged_city_delay$q) <- paste(c("1st", "2nd", "3rd", "4th", "5th"), "Quantile")
merged_city_delay$q <- as.ordered((merged_city_delay$q))


g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showland = TRUE,
  landcolor = toRGB("gray85"),
  subunitwidth = 1,
  countrywidth = 1,
  subunitcolor = toRGB("white"),
  countrycolor = toRGB("white")
)


p <- plot_geo(merged_city_delay, locationmode = 'USA-states', sizes = c(1, 250)) %>%
  add_markers(
    x = ~lon, y = ~lat, size = ~mean_delay, color = ~q, hoverinfo = "text",
    text = ~paste(merged_city_delay$name, "<br />", merged_city_delay$mean_delay, "minutes")
  ) %>%
  layout(title = '2017 average departure delay (minutes) by city', geo = g)




p
## Warning: Ignoring 102 observations
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Step 3: descriptive US maps for delays by routes

# group by flight routes and calculate mean departure delay

route_delay <- df %>%
  group_by(ORIGIN_CITY_NAME, DEST_CITY_NAME) %>%
  summarize(mean_delay = mean(DEP_DELAY_NEW, na.rm = TRUE)) 


library(splitstackshape)
route_delay <- cSplit(route_delay, "ORIGIN_CITY_NAME", sep=",")
route_delay <- cSplit(route_delay, "DEST_CITY_NAME", sep=",")

route_delay <- route_delay %>% mutate(name1 = ORIGIN_CITY_NAME_1, name2 = DEST_CITY_NAME_1)
# add the coordination of cities
coordinate <- read.csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_us_cities.csv')

route_delay <- route_delay %>% mutate(name1 = trimws(as.character(name1)), name2 = trimws(as.character(name2)))

coordinate <- coordinate %>% mutate(name = trimws(as.character(name)))


merged_1 <- left_join(route_delay,coordinate, by = c("name1" = "name")) %>%
  rename(lat1 = lat, lon1 = lon, pop1 = pop) %>%
  select(mean_delay, name1, name2, pop1, lat1, lon1)

merged_2 <- left_join(route_delay,coordinate, by = c("name2" = "name")) %>%
  rename(lat2 = lat, lon2 = lon, pop2 = pop) %>%
  select(mean_delay, name1, name2, pop2, lat2, lon2)

merged_route_delay <- left_join(merged_1, merged_2, by = c("name1", "name2")) %>%
  rename(mean_delay = mean_delay.x) %>%
  select(mean_delay, name1, name2, pop1, lat1, lon1, pop2, lat2, lon2)

merged_route_delay <- merged_route_delay %>%      # get the mean population for each city
  group_by(name1, name2) %>%
  summarize(mean_delay = mean(mean_delay, na.rm = TRUE), 
            pop1 = mean(pop1, na.rm = TRUE), pop2 = mean(pop2, na.rm = TRUE),
            lat1 = mean(lat1, na.rm = TRUE), lon1 = mean(lon1, na.rm = TRUE),
            lat2 = mean(lat2, na.rm = TRUE), lon2 = mean(lon2, na.rm = TRUE))
# map projection

# restrict to >15, >30, >60, >90 minutes of delay
delay15 <-merged_route_delay %>%
  filter(mean_delay >= 15) 
delay30 <-merged_route_delay %>%
  filter(mean_delay >= 30) 
delay60 <-merged_route_delay %>%
  filter(mean_delay >= 60) 
delay90 <-merged_route_delay %>%
  filter(mean_delay >= 90) 

geo <- list(
  scope = 'north america',
  projection = list(type = 'azimuthal equal area'),
  showland = TRUE,
  landcolor = toRGB("gray95"),
  countrycolor = toRGB("gray80")
)


p1 <- plot_geo(locationmode = 'USA-states', color = I("red")) %>%
  add_markers(
    data = delay15, x = ~lon1, y = ~lat1, text = ~name1,
    size = ~pop1, hoverinfo = "text", alpha = 0.5
  ) %>%
  add_markers(
    data = delay15, x = ~lon2, y = ~lat2, text = ~name2,
    size = ~pop2, hoverinfo = "text", alpha = 0.5
  ) %>%
  add_segments(
    x = ~lon1, xend = ~lon2,
    y = ~lat1, yend = ~lat2,
    
    alpha = 0.3, size = I(1), hoverinfo = "none"
  ) %>%
  layout(
    title = '2017 flight routes with >15 min delay',
    geo = geo, showlegend = FALSE) 



p2 <- plot_geo(locationmode = 'USA-states', color = I("red")) %>%
  add_markers(
    data = delay30, x = ~lon1, y = ~lat1, text = ~name1,
    size = ~pop1, hoverinfo = "text", alpha = 0.5
  ) %>%
  add_markers(
    data = delay30, x = ~lon2, y = ~lat2, text = ~name2,
    size = ~pop2, hoverinfo = "text", alpha = 0.5
  ) %>%
  add_segments(
    x = ~lon1, xend = ~lon2,
    y = ~lat1, yend = ~lat2,
    
    alpha = 0.3, size = I(1), hoverinfo = "none"
  ) %>%
  layout(
    title = '2017 flight routes with >30 min delay',
    geo = geo, showlegend = FALSE)



p3 <- plot_geo(locationmode = 'USA-states', color = I("red")) %>%
  add_markers(
    data = delay60, x = ~lon1, y = ~lat1, text = ~name1,
    size = ~pop1, hoverinfo = "text", alpha = 0.5
  ) %>%
  add_markers(
    data = delay60, x = ~lon2, y = ~lat2, text = ~name2,
    size = ~pop2, hoverinfo = "text", alpha = 0.5
  ) %>%
  add_segments(
    x = ~lon1, xend = ~lon2,
    y = ~lat1, yend = ~lat2,
    
    alpha = 0.3, size = I(1), hoverinfo = "none"
  ) %>%
  layout(
    title = '2017 flight routes with >60 min delay',
    geo = geo, showlegend = FALSE )


p4 <- plot_geo(locationmode = 'USA-states', color = I("red")) %>%
  add_markers(
    data = delay90, x = ~lon1, y = ~lat1, text = ~name1,
    size = ~pop1, hoverinfo = "text", alpha = 0.5
  ) %>%
  add_markers(
    data = delay90, x = ~lon2, y = ~lat2, text = ~name2,
    size = ~pop2, hoverinfo = "text", alpha = 0.5
  ) %>%
  add_segments(
    x = ~lon1, xend = ~lon2,
    y = ~lat1, yend = ~lat2,
    
    alpha = 0.3, size = I(1), hoverinfo = "none"
  ) %>%
  layout(
    title = '2017 flight routes with >90 min delay',
    geo = geo, showlegend = FALSE )
p <- subplot(p1, p2, p3, p4, nrows = 2) %>%
  layout(title = "2017 flight routes with different delay times",
         xaxis = list(domain=list(x=c(0,0.5),y=c(0,0.5))),
         scene = list(domain=list(x=c(0.5,1),y=c(0,0.5))),
         xaxis2 = list(domain=list(x=c(0.5,1),y=c(0.5,1))),
         annotations = list(
 list(x = 0.2 , y = 1, text = ">15 mins", showarrow = F, xref='paper', yref='paper'),
  list(x = 0.8 , y = 1, text = ">30 mins", showarrow = F, xref='paper', yref='paper'),
 list(x = 0.2 , y = 0.5, text = ">60 mins", showarrow = F, xref='paper', yref='paper'),
  list(x = 0.8 , y = 0.5, text = ">90 mins", showarrow = F, xref='paper', yref='paper'))
         )
## Warning: Ignoring 262 observations
## Warning: Ignoring 244 observations
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
## Warning: Ignoring 50 observations
## Warning: Ignoring 48 observations
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
## Warning: Ignoring 13 observations
## Warning: Ignoring 22 observations
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
## Warning: Ignoring 2 observations
## Warning: Ignoring 9 observations
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
p